This is the primary notebook accompanying the manuscript: "A case-study for improved reusability of plant phenotyping data with MIAPPE". The manuscript references sections numbered below.
Available files (in the data-generated folder and subfolders):
- miappe_metadata.ttl: RDF metadata for 5 CE experiments
- MIAPPE_CxE_v1.1.xlsx: Excel file with data and metadata - has been converted to miappe-metadata.ttl
- station_metadata.ttl: information about the "weather stations" with the weather (mean day temperature, hours of sunlight) for the 5 CE experiments, but no actual data
- weather folder: all_weather.ttl combines the content of all files in the weather folder
- phenotypic folder: data_1999NL.ttl, data_2003VE.ttl, data_2004Fin.ttl, data_2005Fin.ttl, data_2010ET.ttl
# setup
import os
import random
import numpy as np
import numpy.polynomial.polynomial as poly
import rdflib
import requests
import SPARQLWrapper
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#from matplotlib.ticker import AutoMinorLocator
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
#from numpy.polynomial import Polynomial
# Show full cell contents in displayed frames: None means "no truncation".
# The old sentinel -1 was deprecated in pandas 1.0 and rejected by pandas >= 2.0.
pd.set_option('display.max_colwidth', None)  # or 199
# base directories for the generated data files
data_loc = 'data-generated' + os.sep
data_weather = data_loc + 'weather' + os.sep
data_pheno = data_loc + 'phenotypic' + os.sep
# helper functions: query_DB, formula_btt, calculate_PBTT
def query_DB(sparql_query):
    '''
    Query the endpoint with the given query string and return the results
    as a pandas DataFrame.

    One row per result binding; column names carry the SPARQL variable name
    plus a suffix such as '.value' produced by json_normalize.
    '''
    # create the connection to the endpoint
    sparql_service_url = 'http://fuseki:3030/pheno/query'
    #sparql_service_url = 'http://localhost:3030/pheno/query'
    sparql = SPARQLWrapper.SPARQLWrapper(sparql_service_url, agent='Sparql Wrapper on Jupyter example')
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(SPARQLWrapper.JSON)
    # ask for the result
    result = sparql.query().convert()
    # pandas.io.json.json_normalize was deprecated in pandas 1.0 and later
    # removed; the top-level pd.json_normalize is the supported spelling
    return pd.json_normalize(result['results']['bindings'])
def formula_btt(t):
    '''
    Beta thermal time (BTT) for a daily mean temperature t (degrees C).

    Uses fixed cardinal temperatures for potato: base tb, optimum to,
    ceiling tc; ct is the curvature exponent. Below tb or above tc no
    thermal time accumulates, so 0 is returned. (The original version
    produced a complex number outside that range and relied on the
    `btt > 0` comparison raising inside a bare except to yield 0.)
    '''
    # fixed values based on potato
    tb = 5.5
    to = 23.4
    tc = 34.6
    ct = 1.7
    # no development below the base or above the ceiling temperature
    if t <= tb or t >= tc:
        return 0
    btt = (
        ((tc - t) / (tc - to)) *
        ((t - tb) / (to - tb)) **
        ((to - tb) / (tc - to))
    ) ** ct
    # guard against tiny negative values from floating-point rounding
    return btt if btt > 0 else 0
def calculate_PBTT(df_w_cur):
    '''
    Add beta-thermal-time columns to a daily weather table and return it
    together with the final cumulative value.

    Expects columns 'mean_temperature' and 'hours_of_sunlight' (numeric or
    numeric-like strings). Adds: BTT (beta thermal time via formula_btt),
    day_fraction (sunlight hours / 24), PBTT (photoperiod-weighted BTT),
    and cum_PBTT (running total of PBTT).

    Returns (augmented frame, final cum_PBTT); for a frame that is empty
    after dropna(), the cumulative total is 0.0.
    '''
    # work on a copy so the caller's frame is never mutated and the
    # assignments below cannot trigger SettingWithCopyWarning
    df_w_cur = df_w_cur.dropna().copy()
    df_w_cur['mean_temperature'] = pd.to_numeric(df_w_cur['mean_temperature'])
    df_w_cur['hours_of_sunlight'] = pd.to_numeric(df_w_cur['hours_of_sunlight'])
    # formula_btt is a scalar function, so map it; everything else vectorizes
    df_w_cur['BTT'] = df_w_cur['mean_temperature'].map(formula_btt)
    df_w_cur['day_fraction'] = df_w_cur['hours_of_sunlight'] / 24.0
    # formula_btt never returns negatives, but keep the original >0 guard
    df_w_cur['PBTT'] = (df_w_cur['BTT'] * df_w_cur['day_fraction']).clip(lower=0)
    df_w_cur['cum_PBTT'] = df_w_cur['PBTT'].cumsum()
    if df_w_cur.empty:
        # original code crashed on an all-NaN input; 0.0 is the sensible total
        return df_w_cur, 0.0
    cum_pbtt = df_w_cur['cum_PBTT'].iloc[-1]
    return df_w_cur, cum_pbtt
def autolabel(rects, axes=None):
    '''
    Attach a text label above each bar displaying its (rounded) height.

    rects: the bar rectangles, as returned by bar()/barh().
    axes:  the Axes to draw on; defaults to the module-level `ax` so that
           existing autolabel(rects) calls keep their original behavior.
    taken from: from https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
    '''
    if axes is None:
        # preserve the original implicit dependency on the global `ax`
        axes = ax
    for rect in rects:
        height = rect.get_height()
        axes.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                  round(height, 2),
                  ha='center', va='bottom')
def non_increasing(L):
    # True when no element is smaller than its successor
    for left, right in zip(L, L[1:]):
        if left < right:
            return False
    return True
def non_decreasing(L):
    # True when no element is larger than its successor
    return not any(earlier > later for earlier, later in zip(L, L[1:]))
def monotonic(L):
    # a sequence is monotonic when it never changes direction
    if non_increasing(L):
        return True
    return non_decreasing(L)
We can start with the top MIAPPE object, the Investigation, and examine the properties it has:
# fetch investigation properties
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?investigationProperty
WHERE {
    ?inv a ppeo:investigation .
    ?inv ?investigationProperty ?_
}
''')
print('Investigation properties:')
result = query_DB(datatable_query)
columnsToKeep = ['investigationProperty.value']
# select the wanted columns and strip the '.value' suffix in one expression
df_inv = result.loc[:, columnsToKeep].rename(
    columns={name: name[:-6] for name in columnsToKeep})
df_inv
Investigation properties:
| investigationProperty | |
|---|---|
| 0 | http://www.w3.org/1999/02/22-rdf-syntax-ns#type |
| 1 | http://purl.org/ppeo/PPEO.owl#hasName |
| 2 | http://purl.org/ppeo/PPEO.owl#hasAssociatedPublication |
| 3 | http://purl.org/ppeo/PPEO.owl#hasDescription |
| 4 | http://purl.org/ppeo/PPEO.owl#hasIdentifier |
| 5 | http://purl.org/ppeo/PPEO.owl#hasLicense |
| 6 | http://purl.org/ppeo/PPEO.owl#hasMIAPPEVersion |
| 7 | http://purl.org/ppeo/PPEO.owl#hasPart |
| 8 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole |
| 9 | http://purl.org/ppeo/PPEO.owl#hasPublicReleaseDate |
| 10 | http://purl.org/ppeo/PPEO.owl#hasSubmissionDate |
Next, we can check the values of those properties. For those that are not plain literals, we also choose to show their class here (if given):
# fetch investigation properties and values (no reasoning for transitivity)
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?investigationProperty ?target ?type
WHERE {
    ?inv a ppeo:investigation ;
        ?investigationProperty ?target .
    OPTIONAL {?target a ?type}
}
ORDER BY ?investigationProperty
''')
result = query_DB(datatable_query)
columnsToKeep = ['investigationProperty.value',
                 'target.value', 'type.value']
# select the wanted columns and strip the '.value' suffix in one expression
df_inv = result.loc[:, columnsToKeep].rename(
    columns={name: name[:-6] for name in columnsToKeep})
df_inv
| investigationProperty | target | type | |
|---|---|---|---|
| 0 | http://purl.org/ppeo/PPEO.owl#hasAssociatedPublication | https://library.wur.nl/WebQuery/wurpubs/fulltext/240586 | NaN |
| 1 | http://purl.org/ppeo/PPEO.owl#hasDescription | FAIRified, partial data from the 2012 thesis of Paula Ximena Hurtado Lopez | NaN |
| 2 | http://purl.org/ppeo/PPEO.owl#hasIdentifier | WUR_inv_CE2020 | NaN |
| 3 | http://purl.org/ppeo/PPEO.owl#hasLicense | CC-BY 4.0 | NaN |
| 4 | http://purl.org/ppeo/PPEO.owl#hasMIAPPEVersion | 1.1 | NaN |
| 5 | http://purl.org/ppeo/PPEO.owl#hasName | Investigating genotype by environment and QTL by environment interactions for developmental traits in potato | NaN |
| 6 | http://purl.org/ppeo/PPEO.owl#hasPart | http://localhost:3030/wur/fair-example#study/WUR_inv_CE2020_1999NL | http://purl.org/ppeo/PPEO.owl#study |
| 7 | http://purl.org/ppeo/PPEO.owl#hasPart | http://localhost:3030/wur/fair-example#study/WUR_inv_CE2020_2003VE | http://purl.org/ppeo/PPEO.owl#study |
| 8 | http://purl.org/ppeo/PPEO.owl#hasPart | http://localhost:3030/wur/fair-example#study/WUR_inv_CE2020_2004Fin | http://purl.org/ppeo/PPEO.owl#study |
| 9 | http://purl.org/ppeo/PPEO.owl#hasPart | http://localhost:3030/wur/fair-example#study/WUR_inv_CE2020_2005Fin | http://purl.org/ppeo/PPEO.owl#study |
| 10 | http://purl.org/ppeo/PPEO.owl#hasPart | http://localhost:3030/wur/fair-example#study/WUR_inv_CE2020_2010ET | http://purl.org/ppeo/PPEO.owl#study |
| 11 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole | http://localhost:3030/wur/fair-example#person_role/WUR_inv_CE2020_2010ET_coodinator | NaN |
| 12 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole | http://localhost:3030/wur/fair-example#person_role/WUR_inv_CE2020_2010ET_corresponding_author | NaN |
| 13 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole | http://localhost:3030/wur/fair-example#person_role/WUR_inv_CE2020_2010ET_data_author | NaN |
| 14 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole | http://localhost:3030/wur/fair-example#person_role/WUR_inv_CE2020_2010ET_scientist | NaN |
| 15 | http://purl.org/ppeo/PPEO.owl#hasPersonWithRole | http://localhost:3030/wur/fair-example#person_role/WUR_inv_CE2020_2010ET_scientist__coordinator | NaN |
| 16 | http://purl.org/ppeo/PPEO.owl#hasPublicReleaseDate | 2020-01-01 | NaN |
| 17 | http://purl.org/ppeo/PPEO.owl#hasSubmissionDate | 2020-01-01 | NaN |
| 18 | http://www.w3.org/1999/02/22-rdf-syntax-ns#type | http://purl.org/ppeo/PPEO.owl#investigation | NaN |
We can choose to take a look at studies, the next object in the MIAPPE hierarchy, and look at some of their properties.
# fetch study properties
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?studyId ?cntName ?addr ?locName ?startDate ?endDate ?lat ?long ?alt
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier ?studyId ;
        ppeo:hasStartDateTime ?startDate ;
        ppeo:hasEndDateTime ?endDate ;
        ppeo:hasLocation ?studyloc .
    ?studyloc ppeo:hasCountry ?cnt ;
        ppeo:hasLatitude ?lat ;
        ppeo:hasLongitude ?long ;
        ppeo:hasAltitude ?alt ;
        ppeo:hasAddress ?addr ;
        ppeo:hasName ?locName .
    ?cnt ppeo:hasName ?cntName .
}
ORDER BY ?studyId
''')
result = query_DB(datatable_query)
columnsToKeep = ['studyId.value', 'cntName.value', 'locName.value', 'addr.value', 'startDate.value',
                 'endDate.value', 'lat.value', 'long.value', 'alt.value']
# select the wanted columns and strip the '.value' suffix in one expression
df_studies = result.loc[:, columnsToKeep].rename(
    columns={name: name[:-6] for name in columnsToKeep})
df_studies
| studyId | cntName | locName | addr | startDate | endDate | lat | long | alt | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999NL | NL | Wageningen | WUR field | 1999-03-12 | 1999-11-14 | 52.0 | 5.63 | 11m |
| 1 | 2003VE | VE | Merida | Mucupiche field in La Fresa | 2003-06-24 | 2003-11-30 | 8.6 | -71.13 | 2200m |
| 2 | 2004Fin | FI | Ruukki | Field in North Ostrobothnia Research Station | 2004-04-16 | 2004-11-10 | 64.7 | 25.0 | 48m |
| 3 | 2005Fin | FI | Ruukki | Field in North Ostrobothnia Research Station | 2005-05-03 | 2005-11-14 | 64.7 | 25.0 | 48m |
| 4 | 2010ET | ET | Holeta | Field in Holeta Agricultural Research Institute | 2010-07-16 | 2010-12-06 | 9.12 | 38.13 | 2400m |
We can easily find the genotypes (biological materials) that were part of all 5 studies:
# common genotypes across 5 studies (intersection across experiments)
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?bio_mat_id
WHERE {
    ?study1 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier '1999NL' .
    ?study2 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier '2005Fin' .
    ?study3 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier '2003VE' .
    ?study4 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier '2004Fin' .
    ?study5 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier '2010ET' .
    ?bio_mat ppeo:hasIdentifier ?bio_mat_id .
}
ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep]
# report the count before renaming, exactly as before
print(str(len(df_geno)) + ' genotypes overlap:')
df_geno = df_geno.rename(columns={name: name[:-6] for name in columnsToKeep})
df_geno.head()
101 genotypes overlap:
| bio_mat_id | |
|---|---|
| 0 | WUR:CE011 |
| 1 | WUR:CE032 |
| 2 | WUR:CE047 |
| 3 | WUR:CE067 |
| 4 | WUR:CE070 |
We can also calculate the total number of genotypes, i.e. the ones that were studied in at least one experiment:
# all genotypes (union across experiments)
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?bio_mat_id
WHERE {
    ?study1 a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat .
    ?bio_mat ppeo:hasIdentifier ?bio_mat_id .
}
ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep]
# report the count before renaming, exactly as before
print(str(len(df_geno)) + ' genotypes in total:')
df_geno = df_geno.rename(columns={name: name[:-6] for name in columnsToKeep})
df_geno.head()
292 genotypes in total:
| bio_mat_id | |
|---|---|
| 0 | WUR:ASTARTE |
| 1 | WUR:AWASH |
| 2 | WUR:BELETE |
| 3 | WUR:BINTJE |
| 4 | WUR:BULLE |
# genotypes per experiment
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT (?study_id as ?Study_ID) (count(distinct ?bio_mat_id) as ?genotypes)
WHERE {
    ?study a ppeo:study ;
        ppeo:hasBiologicalMaterial ?bio_mat ;
        ppeo:hasIdentifier ?study_id .
    ?bio_mat ppeo:hasIdentifier ?bio_mat_id .
} GROUP BY ?study_id
''')
result = query_DB(datatable_query)
columnsToKeep = ['Study_ID.value', 'genotypes.value']
df_geno = result.loc[:, columnsToKeep]
print('Genotypes per experiment:')
df_geno = df_geno.rename(columns={name: name[:-6] for name in columnsToKeep})
df_geno.head()
Genotypes per experiment:
| Study_ID | genotypes | |
|---|---|---|
| 0 | 2005Fin | 260 |
| 1 | 2003VE | 107 |
| 2 | 2010ET | 190 |
| 3 | 2004Fin | 251 |
| 4 | 1999NL | 242 |
# pull stations from /weather with service
datatable_query = ('''
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?name ?latitude ?longitude ?altitude
WHERE {
    SERVICE <http://localhost:3030/weather/query> {
        ?ws a aemet:WeatherStation ;
            aemet:stationName ?name ;
            geo:location ?loc .
        ?loc geo:lat ?latitude ;
            geo:long ?longitude ;
            geo:alt ?altitude .
    }
} ORDER BY ASC(?obsTime)
''')
# NOTE(review): ?obsTime is unbound in this query, so the ORDER BY is a
# harmless no-op (looks like a leftover from the observation queries below)
result = query_DB(datatable_query)
columnsToKeep = ['name.value', 'latitude.value',
                 'longitude.value', 'altitude.value']
# .copy() makes an independent frame: renaming a bare slice of `result`
# in place can trigger pandas' SettingWithCopyWarning
df_stations = result[columnsToKeep].copy()
df_stations.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
df_stations
| name | latitude | longitude | altitude | |
|---|---|---|---|---|
| 0 | WUR station | 52 | 5.63 | 11 |
| 1 | Merida station | 8.6 | -71.13 | 2200 |
| 2 | Holeta station | 9.12 | 38 | 2400 |
| 3 | Ruuki station | 64.7 | 25 | 48 |
# pull NL sunhours from /weather with service
datatable_query = ('''
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?obsTime ?obsVal
WHERE {
    SERVICE <http://localhost:3030/weather/query> {
        ?obs a aemet:Observation ;
            aemet:valueOfObservedData ?obsVal ;
            aemet:observedInInterval ?obsTime ;
            ssn:observedProperty ?var ;
            ssn:observedBy ?station .
        ?station aemet:stationName ?obsStation .
        FILTER("WUR station"=?obsStation) .
        FILTER(?obsTime > "1999-05-11"^^xsd:date &&
               ?obsTime < "1999-11-15"^^xsd:date)
        ?var a aemet:AmbientProperty ;
            rdfs:label "Hours of sunlight" .
    }
} ORDER BY ASC(?obsTime)
''')
result = query_DB(datatable_query)
columnsToKeep = ['obsTime.value', 'obsVal.value']
# .copy() so the rename here and the type conversions in the plot cell
# operate on an independent frame, not a slice of `result`
# (avoids SettingWithCopyWarning / silent chained assignment)
df_nl_sun = result[columnsToKeep].copy()
df_nl_sun.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
display(df_nl_sun.head())
| obsTime | obsVal | |
|---|---|---|
| 0 | 1999-05-12 | 15.54 |
| 1 | 1999-05-13 | 15.59 |
| 2 | 1999-05-14 | 15.64 |
| 3 | 1999-05-15 | 15.69 |
| 4 | 1999-05-16 | 15.75 |
Plot (hours of sunlight):
# make plot
# convert the raw string columns in place: obsTime -> datetime, obsVal -> float
df_nl_sun.loc[:, 'obsTime'] = pd.to_datetime(df_nl_sun['obsTime'])
df_nl_sun.loc[:, 'obsVal'] = pd.to_numeric(df_nl_sun['obsVal'], downcast="float")
# allow matplotlib to plot pandas datetime values on the x-axis
pd.plotting.register_matplotlib_converters()
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df_nl_sun['obsTime'], df_nl_sun['obsVal'])
ax.set_title('Daily hours of sunlight (NL)')
plt.ylabel('Hours per day')
# one major tick per month, labelled as year-month
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
# check distances from experimental locations to weather station coordinates
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT distinct ?studyId ?cntName ?stu_locName ?stu_lat ?stu_long ?w_name ?w_lat ?w_long ?dist_sq
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier ?studyId ;
        ppeo:hasLocation ?studyloc .
    ?studyloc ppeo:hasCountry ?cnt ;
        ppeo:hasLatitude ?stu_lat ;
        ppeo:hasLongitude ?stu_long ;
        ppeo:hasName ?stu_locName .
    ?cnt ppeo:hasName ?cntName .
    SERVICE <http://localhost:3030/weather/query> {
        ?ws a aemet:WeatherStation ;
            aemet:stationName ?w_name ;
            geo:location ?w_loc .
        ?w_loc geo:lat ?w_lat ; geo:long ?w_long .
    }
    BIND(((?stu_lat - ?w_lat) * (?stu_lat - ?w_lat)) AS ?lat_diff_sq)
    BIND(((?stu_long - ?w_long) * (?stu_long - ?w_long)) AS ?long_diff_sq)
    BIND((?lat_diff_sq + ?long_diff_sq) AS ?dist_sq)
}
ORDER BY ?dist_sq
''')
result = query_DB(datatable_query)
columnsToKeep = ['studyId.value', 'cntName.value', 'stu_locName.value', 'stu_lat.value', 'stu_long.value',
                 'w_name.value', 'w_lat.value', 'w_long.value', 'dist_sq.value']
# .copy() -> independent frame; renaming a bare slice of `result` in place
# can trigger SettingWithCopyWarning
df_distances = result[columnsToKeep].copy()
df_distances.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
display(df_distances)
| studyId | cntName | stu_locName | stu_lat | stu_long | w_name | w_lat | w_long | dist_sq | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2004Fin | FI | Ruukki | 64.7 | 25.0 | Ruuki station | 64.7 | 25 | 0.0 |
| 1 | 2005Fin | FI | Ruukki | 64.7 | 25.0 | Ruuki station | 64.7 | 25 | 0.0 |
| 2 | 1999NL | NL | Wageningen | 52.0 | 5.63 | WUR station | 52 | 5.63 | 0.0 |
| 3 | 2003VE | VE | Merida | 8.6 | -71.13 | Merida station | 8.6 | -71.13 | 0.0 |
| 4 | 2010ET | ET | Holeta | 9.12 | 38.13 | Holeta station | 9.12 | 38 | 0.016900279 |
| 5 | 2004Fin | FI | Ruukki | 64.7 | 25.0 | WUR station | 52 | 5.63 | 536.4868 |
| 6 | 2005Fin | FI | Ruukki | 64.7 | 25.0 | WUR station | 52 | 5.63 | 536.4868 |
| 7 | 1999NL | NL | Wageningen | 52.0 | 5.63 | Ruuki station | 64.7 | 25 | 536.4868 |
| 8 | 1999NL | NL | Wageningen | 52.0 | 5.63 | Holeta station | 9.12 | 38 | 2886.5112 |
| 9 | 2010ET | ET | Holeta | 9.12 | 38.13 | WUR station | 52 | 5.63 | 2894.9443 |
| 10 | 2004Fin | FI | Ruukki | 64.7 | 25.0 | Holeta station | 9.12 | 38 | 3258.1362 |
| 11 | 2005Fin | FI | Ruukki | 64.7 | 25.0 | Holeta station | 9.12 | 38 | 3258.1362 |
| 12 | 2010ET | ET | Holeta | 9.12 | 38.13 | Ruuki station | 64.7 | 25 | 3261.5332 |
| 13 | 1999NL | NL | Wageningen | 52.0 | 5.63 | Merida station | 8.6 | -71.13 | 7775.6567 |
| 14 | 2003VE | VE | Merida | 8.6 | -71.13 | WUR station | 52 | 5.63 | 7775.6567 |
| 15 | 2003VE | VE | Merida | 8.6 | -71.13 | Holeta station | 9.12 | 38 | 11909.627 |
| 16 | 2010ET | ET | Holeta | 9.12 | 38.13 | Merida station | 8.6 | -71.13 | 11938.017 |
| 17 | 2004Fin | FI | Ruukki | 64.7 | 25.0 | Merida station | 8.6 | -71.13 | 12388.187 |
| 18 | 2005Fin | FI | Ruukki | 64.7 | 25.0 | Merida station | 8.6 | -71.13 | 12388.187 |
| 19 | 2003VE | VE | Merida | 8.6 | -71.13 | Ruuki station | 64.7 | 25 | 12388.187 |
In this section we read in the data files for each study. We can view the variables in a data file for a study as follows:
# variables for 1999NL
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT distinct ?filename ?varName
WHERE {
    ?file ppeo:hasObservation ?obs ;
        ppeo:hasDigitalLocation ?filename .
    ?study ppeo:hasDataFile ?file ;
        ppeo:hasIdentifier '1999NL' .
    ?obs ppeo:hasVariable ?var .
    ?study ppeo:hasPart ?var .
    ?var ppeo:hasIdentifier ?varName .
}
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'varName.value']
# .copy() -> independent frame; renaming a bare slice of `result` in place
# can trigger SettingWithCopyWarning
df_xvars = result[columnsToKeep].copy()
df_xvars.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
df_xvars
| filename | varName | |
|---|---|---|
| 0 | data_1999NL.csv | TubN_average_per_genotype |
| 1 | data_1999NL.csv | TubN_total_per_plant |
| 2 | data_1999NL.csv | TubW_average_per_genotype |
| 3 | data_1999NL.csv | TubW_total_per_plant |
Create the data table for this study. We will also add a column to hold the genotypes for later use.
(The code for putting SPARQLWrapper output into a dataframe is based on this page)
# 1999NL data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_total_per_plant ?TubW_total_per_plant # ?TubN_average_per_genotype ?TubW_average_per_genotype
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '1999NL' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename .
    ?file ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubN_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou ;
        ppeo:hasValue ?TubN_total_per_plant .
    OPTIONAL {?file ppeo:hasObservation ?obs2 .
        ?var2 a ppeo:observed_variable ;
            ppeo:hasIdentifier 'TubW_total_per_plant' .
        ?obs2 ppeo:hasVariable ?var2 ;
            ppeo:hasDateTime ?date ;
            ppeo:hasObservedSubject ?ou ;
            ppeo:hasValue ?TubW_total_per_plant .}
    # OPTIONAL {?file ppeo:hasObservation ?obs3 .
    #     ?var3 a ppeo:observed_variable ;
    #         ppeo:hasIdentifier 'TubN_average_per_genotype' .
    #     ?obs3 ppeo:hasVariable ?var3 ;
    #         ppeo:hasDateTime ?date ;
    #         ppeo:hasObservedSubject ?ou ;
    #         ppeo:hasValue ?TubN_average_per_genotype .}
    #
    # OPTIONAL {?file ppeo:hasObservation ?obs4 .
    #     ?var4 a ppeo:observed_variable ;
    #         ppeo:hasIdentifier 'TubW_average_per_genotype' .
    #     ?obs4 ppeo:hasVariable ?var4 ;
    #         ppeo:hasDateTime ?date ;
    #         ppeo:hasObservedSubject ?ou ;
    #         ppeo:hasValue ?TubW_average_per_genotype .}
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:hasBiologicalMaterial ?bm .
    ?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value',
                 'TubN_total_per_plant.value', 'TubW_total_per_plant.value']
# .copy() -> independent frame; the in-place rename and the numeric
# conversion below would otherwise act on a slice of `result`
# (SettingWithCopyWarning / silent chained assignment)
datatable_1999NL = result[columnsToKeep].copy()
datatable_1999NL.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_1999NL['TubW_total_per_plant'] = pd.to_numeric(
    datatable_1999NL['TubW_total_per_plant'], downcast="float")
# per-genotype mean for later comparison with the averaged query results
datatable_1999NL_perPlant = datatable_1999NL.groupby('bmID')['TubW_total_per_plant'].mean()
display(datatable_1999NL.head())
display(datatable_1999NL.describe())
| filename | date | ouID | bmID | TubN_total_per_plant | TubW_total_per_plant | |
|---|---|---|---|---|---|---|
| 0 | data_1999NL.csv | 1999-11-14 | WUR:CE-NL99-WUR:CE011-1 | WUR:CE011 | 6.1e+01 | 561.559998 |
| 1 | data_1999NL.csv | 1999-11-14 | WUR:CE-NL99-WUR:CE011-2 | WUR:CE011 | 4.6e+01 | 1212.630005 |
| 2 | data_1999NL.csv | 1999-11-14 | WUR:CE-NL99-WUR:CE016-1 | WUR:CE016 | 4.9e+01 | 2637.739990 |
| 3 | data_1999NL.csv | 1999-11-14 | WUR:CE-NL99-WUR:CE016-2 | WUR:CE016 | 2.1e+01 | 608.919983 |
| 4 | data_1999NL.csv | 1999-11-14 | WUR:CE-NL99-WUR:CE017-1 | WUR:CE017 | 2.5e+01 | 395.899994 |
| TubW_total_per_plant | |
|---|---|
| count | 452.000000 |
| mean | 1380.570801 |
| std | 794.384644 |
| min | 24.910000 |
| 25% | 715.117477 |
| 50% | 1330.800049 |
| 75% | 1877.225006 |
| max | 4220.200195 |
# calculate average weight per genotype
# 1999NL data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '1999NL' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename ;
        ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubW_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasValue ?TubW_total_per_plant ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou .
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study ;
        ppeo:hasBiologicalMaterial ?bm .
    ?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
# .copy() -> independent frame; renaming a bare slice of `result` in place
# can trigger SettingWithCopyWarning
datatable_1999NL_avg = result[columnsToKeep].copy()
datatable_1999NL_avg.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_1999NL_avg.head()
| bmID | TubW_T_avg | |
|---|---|---|
| 0 | WUR:CE175 | 1190.325e0 |
| 1 | WUR:CE296 | 670.5250000000001e0 |
| 2 | WUR:CE640 | 1167.605e0 |
| 3 | WUR:CE761 | 1748.4699999999998e0 |
| 4 | WUR:CE652 | 1759.3e0 |
Once again, when building the data table, we will also add a column to hold the genotypes for later use.
# 2003VE data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_average_per_genotype ?TubW_average_per_genotype
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '2003VE' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename .
    OPTIONAL {?file ppeo:hasObservation ?obs1 .
        ?var1 a ppeo:observed_variable ;
            ppeo:hasIdentifier 'TubN_average_per_genotype' .
        ?obs1 ppeo:hasVariable ?var1 ;
            ppeo:hasDateTime ?date ;
            ppeo:hasObservedSubject ?ou ;
            ppeo:hasValue ?TubN_average_per_genotype .}
    OPTIONAL {?file ppeo:hasObservation ?obs2 .
        ?var2 a ppeo:observed_variable ;
            ppeo:hasIdentifier 'TubW_average_per_genotype' .
        ?obs2 ppeo:hasVariable ?var2 ;
            ppeo:hasDateTime ?date ;
            ppeo:hasObservedSubject ?ou ;
            ppeo:hasValue ?TubW_average_per_genotype .}
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study ;
        ppeo:hasBiologicalMaterial ?bm .
    ?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value',
                 'TubN_average_per_genotype.value', 'TubW_average_per_genotype.value']
# .copy() -> independent frame; the in-place rename and the numeric
# conversion below would otherwise act on a slice of `result`
datatable_2003VE = result[columnsToKeep].copy()
datatable_2003VE.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2003VE['TubW_average_per_genotype'] = pd.to_numeric(
    datatable_2003VE['TubW_average_per_genotype'], downcast="float")
# per-genotype mean for later comparison across experiments
datatable_2003VE_perPlant = datatable_2003VE.groupby('bmID')['TubW_average_per_genotype'].mean()
display(datatable_2003VE.head())
display(datatable_2003VE.describe())
| filename | date | ouID | bmID | TubN_average_per_genotype | TubW_average_per_genotype | |
|---|---|---|---|---|---|---|
| 0 | data_2003VE.csv | 2003-11-30 | WUR:CE-VEN03-CE011 | WUR:CE011 | 1.85e+01 | 142.5 |
| 1 | data_2003VE.csv | 2003-11-30 | WUR:CE-VEN03-CE032 | WUR:CE032 | 8e+00 | 237.5 |
| 2 | data_2003VE.csv | 2003-11-30 | WUR:CE-VEN03-CE047 | WUR:CE047 | 2.1e+01 | 230.0 |
| 3 | data_2003VE.csv | 2003-11-30 | WUR:CE-VEN03-CE067 | WUR:CE067 | 1.15e+01 | 62.5 |
| 4 | data_2003VE.csv | 2003-11-30 | WUR:CE-VEN03-CE070 | WUR:CE070 | 7e+00 | 125.0 |
| TubW_average_per_genotype | |
|---|---|
| count | 107.000000 |
| mean | 121.906540 |
| std | 110.696785 |
| min | 8.000000 |
| 25% | 56.000000 |
| 50% | 100.000000 |
| 75% | 167.250000 |
| max | 975.000000 |
The data for Ethiopia is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant:
# 2010ET data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '2010ET' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename ;
        ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubW_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasValue ?TubW_total_per_plant ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou .
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study.
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
# .copy() -> independent frame; renaming a bare slice of `result` in place
# can trigger SettingWithCopyWarning (also fixes stray space before .rename)
datatable_2010ET = result[columnsToKeep].copy()
datatable_2010ET.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2010ET.head()
| filename | date | ouID | TubW_total_per_plant | |
|---|---|---|---|---|
| 0 | data_2010ET.csv | 2010-12-01 | WUR:CE-ETH10-1-1-1 | 550 |
| 1 | data_2010ET.csv | 2010-12-01 | WUR:CE-ETH10-1-1-2 | 191 |
| 2 | data_2010ET.csv | 2010-12-01 | WUR:CE-ETH10-1-1-3 | 535 |
| 3 | data_2010ET.csv | 2010-12-01 | WUR:CE-ETH10-1-1-4 | 250 |
| 4 | data_2010ET.csv | 2010-12-01 | WUR:CE-ETH10-1-10-1 | 300 |
At the same time, we only have the average tuber weight per genotype for the Netherlands and Venezuela.
We can calculate the same thing for Ethiopia and Finland. (new table datatable_2010ET_avg)
# 2010ET data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '2010ET' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename ;
        ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubW_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasValue ?TubW_total_per_plant ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou .
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study ;
        ppeo:hasBiologicalMaterial ?bm .
    ?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
# .copy() -> independent frame; renaming a bare slice of `result` in place
# can trigger SettingWithCopyWarning
datatable_2010ET_avg = result[columnsToKeep].copy()
datatable_2010ET_avg.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2010ET_avg.head()
| bmID | TubW_T_avg | |
|---|---|---|
| 0 | WUR:CE680 | 143.666666666666666666666667 |
| 1 | WUR:CE692 | 446.666666666666666666666667 |
| 2 | WUR:CE640 | 177.416666666666666666666667 |
| 3 | WUR:GUDENE | 581.666666666666666666666667 |
| 4 | WUR:CE761 | 137.5 |
The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant, for block 8 (the one that was harvested last):
# 2004Fin data table
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '2004Fin' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename ;
        ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubW_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasValue ?TubW_total_per_plant ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou .
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study;
        ppeo:hasSpatialDistribution ?sd .
    ?sd ppeo:hasSpatialDistributionType ?sdtype .
    ?sd ppeo:hasValue "8" .
    ?sdtype ppeo:hasType "block" .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
# .copy() -> independent frame; the in-place rename and the numeric
# conversion below would otherwise act on a slice of `result`
datatable_2004Fin = result[columnsToKeep].copy()
datatable_2004Fin.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
# divide by 3 since there were 3 plants
datatable_2004Fin['TubW_total_per_plant'] = pd.to_numeric(
    datatable_2004Fin['TubW_total_per_plant'], downcast="float") / 3
datatable_2004Fin.head()
| filename | date | ouID | TubW_total_per_plant | |
|---|---|---|---|---|
| 0 | data_2004Fin.csv | 2004-09-20 | WUR:CE-FIN04-8-1561-A | 789.363342 |
| 1 | data_2004Fin.csv | 2004-09-20 | WUR:CE-FIN04-8-1561-B | 669.080017 |
| 2 | data_2004Fin.csv | 2004-09-20 | WUR:CE-FIN04-8-1561-C | 587.989990 |
| 3 | data_2004Fin.csv | 2004-09-20 | WUR:CE-FIN04-8-1562-A | 667.753357 |
| 4 | data_2004Fin.csv | 2004-09-20 | WUR:CE-FIN04-8-1562-B | 753.156677 |
Calculating the average tuber weight per genotype:
# 2004Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
    ?study a ppeo:study ;
        ppeo:hasIdentifier '2004Fin' ;
        ppeo:hasDataFile ?file .
    ?file a ppeo:data_file ;
        ppeo:hasDigitalLocation ?filename ;
        ppeo:hasObservation ?obs1 .
    ?var1 a ppeo:observed_variable ;
        ppeo:hasIdentifier 'TubW_total_per_plant' .
    ?obs1 ppeo:hasVariable ?var1 ;
        ppeo:hasValue ?TubW_total_per_plant ;
        ppeo:hasDateTime ?date ;
        ppeo:hasObservedSubject ?ou .
    ?ou a ppeo:observation_unit ;
        ppeo:hasIdentifier ?ouID ;
        ppeo:partOf ?study ;
        ppeo:hasBiologicalMaterial ?bm ;
        ppeo:hasSpatialDistribution ?sd .
    ?sd ppeo:hasSpatialDistributionType ?sdtype .
    ?sd ppeo:hasValue "8" .
    ?sdtype ppeo:hasType "block" .
    ?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
# .copy() -> independent frame; the in-place rename and the derived-column
# assignment below would otherwise act on a slice of `result`
datatable_2004Fin_avg = result[columnsToKeep].copy()
datatable_2004Fin_avg.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
# divide by 3 since each observation unit held 3 plants
datatable_2004Fin_avg['TubW_T_avg_perPlant'] = pd.to_numeric(
    datatable_2004Fin_avg['TubW_T_avg'], downcast="float") / 3
datatable_2004Fin_avg.head()
| bmID | TubW_T_avg | TubW_T_avg_perPlant | |
|---|---|---|---|
| 0 | WUR:CE175 | 2784.433333333333e0 | 928.144470 |
| 1 | WUR:VOYAGER | 1929.8799999999999e0 | 643.293335 |
| 2 | WUR:CE640 | 2592.2066666666665e0 | 864.068909 |
| 3 | WUR:CE773 | 3390.536666666667e0 | 1130.178833 |
| 4 | WUR:CE664 | 2423.9966666666664e0 | 807.998840 |
# 2004Findata - plant height
# endpoint: http://localhost:7200/repositories/miappe_20'
# Plant-height observations for 2004Fin, with the genotype (bmID) of each
# observation unit, ordered by date then observation-unit ID.
datatable_query = ('''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?date ?ouID ?bmID ?plantHeight
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2004Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'PlantHeight' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?plantHeight ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasBiologicalMaterial ?bm .
?bm ppeo:hasIdentifier ?bmID .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['date.value', 'ouID.value', 'bmID.value', 'plantHeight.value']
# copy() so downstream edits do not act on a view; single dict-based rename
# replaces the per-column rename loop.
datatable_2004Fin_ph = result[columnsToKeep].copy()
datatable_2004Fin_ph.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2004Fin_ph.head()
# get mean for each genotype:
# Missing measurements are encoded as '*' in the source data; drop them.
datatable_2004Fin_ph2 = datatable_2004Fin_ph[datatable_2004Fin_ph.plantHeight != '*']
datatable_2004Fin_ph2.loc[:, 'plantHeight'] = pd.to_numeric(
    datatable_2004Fin_ph2['plantHeight'], downcast="float")
# averages per plot:
# The last two characters of the observation-unit ID (e.g. '-A') identify the
# individual plant; stripping them yields the plot identifier.
datatable_2004Fin_ph3 = datatable_2004Fin_ph2.assign(plot=datatable_2004Fin_ph2['ouID'].str[:-2])
# Composite key genotype;date;plot to group on in one pass.
datatable_2004Fin_ph3['bmDateOu'] = (datatable_2004Fin_ph3['bmID'] + ';' + datatable_2004Fin_ph3['date'] +
                                     ';' + datatable_2004Fin_ph3['plot'])
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph3.groupby('bmDateOu')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.to_frame().reset_index()
# Split the composite key back into its components.
datatable_2004Fin_ph_perPlant[['bmID', 'Date', 'plot']] = \
    datatable_2004Fin_ph_perPlant['bmDateOu'].str.split(';', expand=True)
# Keyword form: the positional axis argument to drop() was removed in pandas 2.0.
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.drop(columns='bmDateOu')
# averages per genotype:
# copy() so adding the helper key below does not also mutate
# datatable_2004Fin_ph2 (the original aliased it and wrote through the alias).
datatable_2004Fin_ph4 = datatable_2004Fin_ph2.copy()
# Composite key genotype;date to average over all plots/plants at once.
datatable_2004Fin_ph4['bmDate'] = (datatable_2004Fin_ph4['bmID'] + ';' + datatable_2004Fin_ph4['date'])
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph4.groupby('bmDate')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.to_frame().reset_index()
datatable_2004Fin_ph_perPlant4[['bmID', 'Date']] = \
    datatable_2004Fin_ph_perPlant4['bmDate'].str.split(';', expand=True)
# Keyword form: the positional axis argument to drop() was removed in pandas 2.0.
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.drop(columns='bmDate')
datatable_2004Fin_ph_perPlant4.loc[:, 'Date'] = pd.to_datetime(datatable_2004Fin_ph_perPlant4['Date'])
# Planting date of the 2004 Finland trial. (The original "16 july 2010"
# comment did not match this value — presumably a copy-paste from the 2010ET
# cell; the 2004-06-01 constant is kept unchanged.)
datatable_2004Fin_ph_perPlant4['planting'] = pd.to_datetime('2004-06-01')
# Days after planting for each averaged measurement.
datatable_2004Fin_ph_perPlant4['DAP'] = (datatable_2004Fin_ph_perPlant4['Date'] -
                                         datatable_2004Fin_ph_perPlant4['planting']).dt.days
datatable_2004Fin_ph_perPlant4.head()
| plantHeight | bmID | Date | planting | DAP | |
|---|---|---|---|---|---|
| 0 | 20.500000 | WUR:ASTARTE | 2004-06-30 | 2004-06-01 | 29 |
| 1 | 30.666666 | WUR:ASTARTE | 2004-07-07 | 2004-06-01 | 36 |
| 2 | 42.333332 | WUR:ASTARTE | 2004-07-13 | 2004-06-01 | 42 |
| 3 | 59.666668 | WUR:ASTARTE | 2004-07-20 | 2004-06-01 | 49 |
| 4 | 60.500000 | WUR:ASTARTE | 2004-07-27 | 2004-06-01 | 56 |
The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant:
# 2005Fin data table
# endpoint: http://localhost:7200/repositories/miappe_20'
# Total tuber weight per plant for the 2005 Finland study, block "IV".
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2005Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "IV" .
?sdtype ppeo:hasType "block" .
}
ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value',
                 'TubW_total_per_plant.value']
# copy() so downstream edits do not act on a view; single dict-based rename
# replaces the per-column rename loop.
datatable_2005Fin = result[columnsToKeep].copy()
datatable_2005Fin.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2005Fin.head()
| filename | date | ouID | TubW_total_per_plant | |
|---|---|---|---|---|
| 0 | data_2005Fin.csv | 2005-09-20 | WUR:CE-FIN05-IV-1000-A | 3.44e+02 |
| 1 | data_2005Fin.csv | 2005-09-20 | WUR:CE-FIN05-IV-1000-B | 1.0105e+03 |
| 2 | data_2005Fin.csv | 2005-09-20 | WUR:CE-FIN05-IV-1000-C | 8.205e+02 |
| 3 | data_2005Fin.csv | 2005-09-20 | WUR:CE-FIN05-IV-1001-A | 1.184e+03 |
| 4 | data_2005Fin.csv | 2005-09-20 | WUR:CE-FIN05-IV-1001-B | 5.98e+02 |
Calculating the average tuber weight per genotype:
# 2005Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
# Average total tuber weight per genotype for 2005Fin, block "IV".
datatable_query = (
'''
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
SELECT DISTINCT ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
WHERE {
?study a ppeo:study ;
ppeo:hasIdentifier '2005Fin' ;
ppeo:hasDataFile ?file .
?file a ppeo:data_file ;
ppeo:hasDigitalLocation ?filename ;
ppeo:hasObservation ?obs1 .
?var1 a ppeo:observed_variable ;
ppeo:hasIdentifier 'TubW_total_per_plant' .
?obs1 ppeo:hasVariable ?var1 ;
ppeo:hasValue ?TubW_total_per_plant ;
ppeo:hasDateTime ?date ;
ppeo:hasObservedSubject ?ou .
?ou a ppeo:observation_unit ;
ppeo:hasIdentifier ?ouID ;
ppeo:partOf ?study;
ppeo:hasBiologicalMaterial ?bm ;
ppeo:hasSpatialDistribution ?sd .
?sd ppeo:hasSpatialDistributionType ?sdtype .
?sd ppeo:hasValue "IV" .
?sdtype ppeo:hasType "block" .
?bm ppeo:hasIdentifier ?bmID .
}
GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
# copy() so downstream edits do not act on a view; single dict-based rename
# replaces the per-column rename loop.
datatable_2005Fin_avg = result[columnsToKeep].copy()
datatable_2005Fin_avg.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
datatable_2005Fin_avg.head()
| bmID | TubW_T_avg | |
|---|---|---|
| 0 | WUR:CE175 | 491.5e0 |
| 1 | WUR:CE296 | 507.6666666666667e0 |
| 2 | WUR:VOYAGER | 1022.0e0 |
| 3 | WUR:CE640 | 857.6666666666666e0 |
| 4 | WUR:CE761 | 1151.6666666666667e0 |
# for average weight per genotype
# Build one table with the per-genotype average tuber weight of all five
# studies, joined on the biological-material ID (bmID). pd.merge defaults to
# an inner join, so only genotypes present in EVERY study survive here.
# NOTE(review): datatable_2010ET_avg, datatable_2003VE_perPlant and
# datatable_1999NL_perPlant are presumably built in earlier notebook cells —
# they are not defined in this section.
datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
finland_merge = pd.merge(datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}),
                         datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}), on= 'bmID')
fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}),
                  finland_merge, on='bmID')
fin_et_ven = pd.merge(datatable_2003VE_perPlant, fin_et, on='bmID')
fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)
fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, fin_et_ven, on='bmID')
fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype = fin_et_ven_nl
# SPARQL JSON results arrive as strings: coerce all study columns to numbers.
df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype = df_averageWeightPerGenotype.sort_values('bmID')
display(df_averageWeightPerGenotype.head())
display(df_averageWeightPerGenotype.describe())
# pd.set_option('display.max_rows', 300)
| bmID | 1999NL | 2003VE | 2010ET | 2004Fin | 2005Fin | |
|---|---|---|---|---|---|---|
| 0 | WUR:CE011 | 887.094971 | 142.5 | 254.916667 | 861.528870 | 392.166667 |
| 1 | WUR:CE047 | 707.325012 | 230.0 | 442.083333 | 944.598877 | 570.833333 |
| 2 | WUR:CE067 | 956.275024 | 62.5 | 300.416667 | 674.260010 | 713.166667 |
| 3 | WUR:CE070 | 1371.640015 | 125.0 | 234.090909 | 678.725525 | 560.000000 |
| 4 | WUR:CE100 | 632.919983 | 127.5 | 366.500000 | 607.843323 | 1694.500000 |
| 1999NL | 2003VE | 2010ET | 2004Fin | 2005Fin | |
|---|---|---|---|---|---|
| count | 80.000000 | 80.000000 | 80.000000 | 80.000000 | 80.000000 |
| mean | 1280.952881 | 126.162498 | 259.687519 | 853.342163 | 764.065000 |
| std | 631.450073 | 116.615913 | 87.935837 | 170.252243 | 313.102156 |
| min | 255.299988 | 17.000000 | 117.272727 | 604.811096 | 266.666667 |
| 25% | 827.912506 | 65.625000 | 198.079545 | 723.781647 | 556.875000 |
| 50% | 1247.385010 | 100.000000 | 236.765152 | 828.277740 | 705.500000 |
| 75% | 1671.129944 | 159.125000 | 287.916667 | 971.573898 | 919.666667 |
| max | 3040.670166 | 975.000000 | 567.083333 | 1386.522339 | 1694.500000 |
# Line plot of per-plant tuber weight for the genotypes shared by all studies.
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1,
    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis=dict(title='Genotype', dtick=1, tickangle=85,
               rangemode='tozero',
               range=[-1, len(df_averageWeightPerGenotype) + 1]),
    yaxis=dict(title='plant tuber weight (g)', dtick=200),
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# One connected trace per study, each with a fixed palette colour.
for study, colour in zip(('1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'), colors):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype['bmID'],
                              y=df_averageWeightPerGenotype[study],
                              mode='lines+markers',
                              line=dict(color=colour),
                              name=study))
fig.show()
# pio.write_image(fig, '5_weight_per_plant_overlap.pdf', width=1100, height=500)
# for average weight per genotype
# Same merge as above, but with how='outer' so every genotype appearing in
# ANY study is kept (missing study values become NaN).
all_datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
all_finland_merge = pd.merge(all_datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}),
                             datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}),
                             on= 'bmID', how='outer')
all_fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}),
                      all_finland_merge, on='bmID', how='outer')
all_fin_et_ven = pd.merge(datatable_2003VE_perPlant, all_fin_et, on='bmID', how='outer')
all_fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)
all_fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, all_fin_et_ven, on='bmID', how='outer')
all_fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype_all = all_fin_et_ven_nl
# SPARQL JSON results arrive as strings: coerce all study columns to numbers.
df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype_all = df_averageWeightPerGenotype_all.sort_values('bmID')
display(df_averageWeightPerGenotype_all.head())
display(df_averageWeightPerGenotype_all.describe())
pd.set_option('display.max_rows', 300)
# registered genotypes per study:
# 2005Fin: 260, 2003VE: 107, 2010ET: 190, 2004Fin: 251, 1999NL: 242
| bmID | 1999NL | 2003VE | 2010ET | 2004Fin | 2005Fin | |
|---|---|---|---|---|---|---|
| 281 | WUR:ASTARTE | NaN | NaN | NaN | 662.997742 | 1451.666667 |
| 245 | WUR:AWASH | NaN | NaN | 391.727273 | NaN | NaN |
| 259 | WUR:BELETE | NaN | NaN | 552.727273 | NaN | NaN |
| 250 | WUR:BINTJE | NaN | NaN | 257.500000 | 618.892212 | 1386.833333 |
| 254 | WUR:BULLE | NaN | NaN | 739.166667 | NaN | NaN |
| 1999NL | 2003VE | 2010ET | 2004Fin | 2005Fin | |
|---|---|---|---|---|---|
| count | 233.000000 | 107.000000 | 188.000000 | 216.000000 | 253.000000 |
| mean | 1389.825928 | 121.906540 | 275.111095 | 842.187073 | 732.270290 |
| std | 678.495422 | 110.696785 | 119.832693 | 186.580933 | 315.822943 |
| min | 44.004997 | 8.000000 | 97.500000 | 575.451111 | 130.166667 |
| 25% | 855.815002 | 56.000000 | 197.585859 | 699.551514 | 507.666667 |
| 50% | 1339.489990 | 100.000000 | 240.916667 | 798.835571 | 646.000000 |
| 75% | 1781.599976 | 167.250000 | 317.250000 | 966.128601 | 921.666667 |
| max | 3723.739990 | 975.000000 | 797.666667 | 1404.246704 | 1694.500000 |
# Marker-only plot of per-plant tuber weight for every genotype in any study.
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1,
    ),
    width=1200,  # 3200 is a good size for all labels - when dtick = 1
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis=dict(title='Genotype', dtick=3, tickangle=85,
               rangemode='tozero',
               range=[-2, len(df_averageWeightPerGenotype_all) + 2]),
    yaxis=dict(title='plant tuber weight (g)', dtick=200),
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# One scatter trace per study, each with a fixed palette colour.
for study, colour in zip(('1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'), colors):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype_all['bmID'],
                              y=df_averageWeightPerGenotype_all[study],
                              mode='markers',
                              line=dict(color=colour),
                              name=study))
fig.show()
# pio.write_image(fig, '6-1_weight_per_plant_all.pdf', width=1100, height=500)
# stacked bar graph for all genotypes
# Horizontal stacked bars: one bar per genotype, one segment per study.
df_averageWeightPerGenotype_all_rev = df_averageWeightPerGenotype_all.sort_values('bmID', ascending=False)
# plotly graph for all genotypes
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        xanchor='right',
        x=1,
        yanchor='top',
        y=1,
        orientation='v',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=800,  # 3200 is a good size for all labels - when dtick = 1
    height=2500,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title': 'plant tuber weight (g)',
           'dtick': 200, 'tickangle': 60,
           #'rangemode': 'tozero', 'range': [-2, len(df_averageWeightPerGenotype_all) + 2]
           },
    yaxis={'title': 'Genotype', 'dtick': 1,
           'tickfont': dict(family='Tahoma', size=7, color='black')},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# The colour previously zipped in here was never used (go.Bar gets no
# marker_color, so plotly's default colorway applies); iterate names directly.
for exp in ['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET']:
    fig.add_trace(go.Bar(
        y=df_averageWeightPerGenotype_all_rev['bmID'],
        x=df_averageWeightPerGenotype_all_rev[exp],
        width=0.8,
        name=exp,
        orientation='h',
        opacity=0.6,
    ))
fig.update_layout(barmode='stack')
fig.show()
fig.write_html('stacked_bars.html')
fig.write_image('stacked_bars.svg')
For Netherlands 1999 (df_weather_1999NL):
# 1999NL weather data
# Daily hours of sunlight and mean temperature for the 1999 Netherlands study
# ('WUR station'), fetched from the federated weather endpoint via SERVICE.
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# copy() so the rename below does not act on a view; single dict-based rename
# replaces the per-column rename loop.
df_weather_1999NL = result[columnsToKeep].copy()
df_weather_1999NL.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
For Venezuela 2003 (df_weather_2003VE):
# 2003VE weather data
# Daily hours of sunlight and mean temperature for the 2003 Venezuela study
# ('Merida station'), fetched from the federated weather endpoint via SERVICE.
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# copy() so the rename below does not act on a view; single dict-based rename
# replaces the per-column rename loop.
df_weather_2003VE = result[columnsToKeep].copy()
df_weather_2003VE.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
For Finland 2004 (df_weather_2004Fin):
# 2004Fin weather data
# Daily hours of sunlight and mean temperature for the 2004 Finland study:
# 'Ruuki station', dates before 2005-01-01.
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# copy() so the rename below does not act on a view; single dict-based rename
# replaces the per-column rename loop.
df_weather_2004Fin = result[columnsToKeep].copy()
df_weather_2004Fin.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
df_weather_2004Fin.head()
| wsName | date | hours_of_sunlight | mean_temperature | |
|---|---|---|---|---|
| 0 | Ruuki station | 2004-06-01 | 19.71 | 10.0 |
| 1 | Ruuki station | 2004-06-02 | 19.79 | 10.2 |
| 2 | Ruuki station | 2004-06-03 | 19.86 | 12.2 |
| 3 | Ruuki station | 2004-06-04 | 19.93 | 12.0 |
| 4 | Ruuki station | 2004-06-05 | 20.0 | 13.6 |
For Finland 2005 (df_weather_2005Fin):
# 2005Fin weather data
# Daily hours of sunlight and mean temperature for the 2005 Finland study:
# 'Ruuki station', dates after 2005-01-01.
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
#FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# copy() so the rename below does not act on a view; single dict-based rename
# replaces the per-column rename loop.
df_weather_2005Fin = result[columnsToKeep].copy()
df_weather_2005Fin.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
df_weather_2005Fin.head()
| wsName | date | hours_of_sunlight | mean_temperature | |
|---|---|---|---|---|
| 0 | Ruuki station | 2005-05-16 | 18.23 | 4.8 |
| 1 | Ruuki station | 2005-05-17 | 18.33 | 9.0 |
| 2 | Ruuki station | 2005-05-18 | 18.43 | 7.0 |
| 3 | Ruuki station | 2005-05-19 | 18.53 | 4.9 |
| 4 | Ruuki station | 2005-05-20 | 18.62 | 8.5 |
For Ethiopia 2010 (df_weather_2010ET):
# 2010ET weather data
# Daily hours of sunlight and mean temperature for the 2010 Ethiopia study
# ('Holeta station').
datatable_query = (
'''
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
PREFIX w3ctime: <http://www.w3.org/2006/time#>
PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature
WHERE {
SERVICE <http://localhost:3030/weather/query> {
?ws a aemet:WeatherStation ;
aemet:stationName ?wsName .
?obs1 a aemet:Observation ;
aemet:valueOfObservedData ?hours_of_sunlight ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var1 .
?obs2 a aemet:Observation ;
aemet:valueOfObservedData ?mean_temperature ;
ssn:observedBy ?ws;
aemet:observedInInterval ?date ;
ssn:observedProperty ?var2 .
?var1 rdfs:label "Hours of sunlight" .
?var2 rdfs:label "Mean temperature" .
FILTER(?wsName IN('Holeta station'))
#FILTER(?wsName IN('Merida station'))
#FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
#FILTER(?wsName IN('WUR station'))
}
} ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
# copy() so the rename below does not act on a view; single dict-based rename
# replaces the per-column rename loop.
df_weather_2010ET = result[columnsToKeep].copy()
df_weather_2010ET.rename(columns={c: c[:-6] for c in columnsToKeep}, inplace=True)
df_weather_2010ET.head()
| wsName | date | hours_of_sunlight | mean_temperature | |
|---|---|---|---|---|
| 0 | Holeta station | 2010-07-17 | 12.59 | 13.875 |
| 1 | Holeta station | 2010-07-18 | 12.59 | 14.5625 |
| 2 | Holeta station | 2010-07-19 | 12.58 | 14.35 |
| 3 | Holeta station | 2010-07-20 | 12.58 | 13.4625 |
| 4 | Holeta station | 2010-07-21 | 12.57 | 13.1375 |
Beta thermal days: $\Large g \left ( T \right) = \left [ \left (\frac{T_c - T}{T_c-T_0} \right ) \cdot \left (\frac{T - T_b}{T_0-T_b} \right ) ^{\frac{T_0 - T_b}{T_c - T_0}} \right ]^{c_t}$
# calculate PBTT
# Compute the photo-beta thermal time for each study's weather table and its
# cumulative total. calculate_PBTT is one of the helper functions defined at
# the top of the notebook (alongside query_DB and formula_btt); it returns
# the (augmented) dataframe and the cumulative PBTT value.
df_weather_2003VE, cumPBTT_2003VE = calculate_PBTT(df_weather_2003VE)
df_weather_2004Fin, cumPBTT_2004Fin = calculate_PBTT(df_weather_2004Fin)
df_weather_2005Fin, cumPBTT_2005Fin = calculate_PBTT(df_weather_2005Fin)
df_weather_2010ET, cumPBTT_2010ET = calculate_PBTT(df_weather_2010ET)
df_weather_1999NL, cumPBTT_1999NL = calculate_PBTT(df_weather_1999NL)
# Bar chart of the cumulative photo-beta thermal time (cPBTT) per study,
# followed by a comparison against Paula's reference values.
layout = go.Layout(
    width=500,
    height=300,
    title='Cumulative photo-beta thermal time per study',
    xaxis=dict(title='study', dtick=1, tickangle=45),
    yaxis=dict(title='cumulative PBTT', dtick=10),
    margin=dict(l=20, r=10, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
studies = ['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET']
values = [cumPBTT_1999NL, cumPBTT_2003VE, cumPBTT_2004Fin, cumPBTT_2005Fin, cumPBTT_2010ET]
values = [round(v, 2) for v in values]
# study name -> rounded cumulative PBTT (reused by the performance plots below)
cpbtt = dict(zip(studies, values))
fig = go.Figure(layout=layout)
fig.add_trace(go.Bar(
    x=studies,
    y=[cpbtt[k] for k in studies],
    text=values,
    textposition='auto',
    marker_color=colors[:5],
    opacity=0.8,
))
fig.show()
# Reference cPBTT values computed independently (Paula's) for the same studies.
paula_cpbtt = {'1999NL': 48.75, '2003VE': 22.35, '2004Fin': 32.44, '2005Fin': 40.85, '2010ET': 24.62}
print('Cumulative PBTT differences from Paula\'s (Paula\'s - calculated here):')
for study, p_val, e_val in zip(studies, [paula_cpbtt[k] for k in studies], values):
    print(f'For {study:>7}, {p_val:4.2f} - {e_val:4.2f} = {round(p_val - e_val, 2):+4.2f}')
Cumulative PBTT differences from Paula's (Paula's - calculated here): For 1999NL, 48.75 - 56.18 = -7.43 For 2003VE, 22.35 - 23.11 = -0.76 For 2004Fin, 32.44 - 31.31 = +1.13 For 2005Fin, 40.85 - 39.30 = +1.55 For 2010ET, 24.62 - 24.13 = +0.49
# add columns to df, with values and index of min / max
# For every genotype, record which study gave the highest / lowest average
# tuber weight, and those values.
df_performance = df_averageWeightPerGenotype_all.set_index('bmID', inplace=False)
# Compute ALL row statistics before inserting the string-valued columns:
# the original took max/min(axis=1) after adding 'max_study', which relied on
# pandas silently dropping the non-numeric column — pandas >= 2.0 raises on
# such mixed-type row reductions. Results are unchanged on older pandas.
maxCol = df_performance.idxmax(axis=1)
minCol = df_performance.idxmin(axis=1)
maxVal = df_performance.max(axis=1)
minVal = df_performance.min(axis=1)
df_performance['max_study'] = maxCol
df_performance['max_val'] = maxVal
df_performance['min_val'] = minVal
df_performance['min_study'] = minCol
#list(df_performance.columns.values) - sorted in order of ascending cPBTT:
# ['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']
df_perf_ordered = df_performance[['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']]
# For each genotype, draw one segment from its worst to its best study,
# positioned on the x-axis by each study's cumulative PBTT.
layout = go.Layout(
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis=dict(title='cumulative PBTT', tickangle=85),
    yaxis=dict(title='plant tuber weight (g)', dtick=200),
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
for bm, rec in df_performance.iterrows():
    x_worst = cpbtt[rec['min_study']]
    x_best = cpbtt[rec['max_study']]
    # Red when the worst study has the higher cPBTT (weight drops with cPBTT),
    # green otherwise.
    colour = 'darkred' if x_worst > x_best else 'darkgreen'
    fig.add_traces(go.Scatter(x=[x_worst, x_best],
                              y=[rec['min_val'], rec['max_val']],
                              line=dict(color=colour),
                              name=f"{bm}: {rec['max_study']}-{rec['min_study']}"))
fig.show()
# pio.write_image(fig, '7-3_min-and-max.pdf', width=1100, height=750)
# plotly graph for performance - all (not only best and worst)
# For each genotype, plot its average tuber weight in every study it was
# measured in, with studies ordered by ascending cumulative PBTT; lines whose
# values are monotonic across the ordered studies are drawn green, others red.
layout = go.Layout(
#    legend=dict(
#        x=0, y=1,
#        orientation='h',
#        title_font_family='Tahoma',
#        font=dict(family='Tahoma', size=9, color='black'),
#        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
#    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title':'cumulative PBTT', 'tickangle': 85},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
# Studies in ascending cPBTT order; x positions are the cPBTT values.
studies_ordered = ['2003VE', '2010ET', '2004Fin', '2005Fin', '1999NL']
cpbtt_labels = [cpbtt[s] for s in studies_ordered]
# Buckets counting, among monotonic genotypes, how many studies each one has
# data for (at_least_N = exactly N non-null studies; points = a single study).
at_least_2 = []
at_least_3 = []
at_least_4 = []
at_least_5 = []
points = []
monotonic_counter = 0
for index, row in df_performance.iterrows():
    values_ordered = [row[s] for s in studies_ordered]
    # Studies where this genotype was not measured contribute NaN; drop them.
    comparable_values = [v for v in values_ordered if not pd.isnull(v)]
    # NOTE(review): monotonic() is defined elsewhere in the notebook;
    # presumably True for non-increasing or non-decreasing sequences — confirm.
    if monotonic(comparable_values):
        col = 'darkgreen'
        if len(comparable_values) > 4: at_least_5.append(index)
        elif len(comparable_values) > 3: at_least_4.append(index)
        elif len(comparable_values) > 2: at_least_3.append(index)
        elif len(comparable_values) > 1: at_least_2.append(index)
        else: points.append(index)
        monotonic_counter += 1
    else:
        col = 'darkred'
    label = index  #+ ': ' + row['max_study'] + '-' + row['min_study']
    fig.add_traces(go.Scatter(x=cpbtt_labels,
                              y=values_ordered,
                              line=dict(color=col),
                              name=label
                              ))
fig.show()
# Report bucket sizes and the total number of monotonic genotypes.
print('1: ' + str(len(points)))
print('2: ' + str(len(at_least_2)))
print('3: ' + str(len(at_least_3)))
print('4: ' + str(len(at_least_4)) + ' ' + str(at_least_4))
print('5: ' + str(len(at_least_5)))
print(monotonic_counter)
1: 38 2: 36 3: 27 4: 23 ['WUR:CE171', 'WUR:CE277', 'WUR:CE298', 'WUR:CE350', 'WUR:CE615', 'WUR:CE628', 'WUR:CE632', 'WUR:CE647', 'WUR:CE651', 'WUR:CE665', 'WUR:CE666', 'WUR:CE675', 'WUR:CE678', 'WUR:CE687', 'WUR:CE696', 'WUR:CE699', 'WUR:CE720', 'WUR:CE723', 'WUR:CE724', 'WUR:CE761', 'WUR:CE765', 'WUR:CE776', 'WUR:CE778'] 5: 26 150